Lab6: Evaluating RSF Models

Today we will cover the statistical evaluation of model goodness of fit from a predictive perspective. First we will cover the statistical approaches for evaluating used-unused designs. Second we will build on the statistical model evaluation to address the most important aspect of evaluating RSF models – how well they actually predict observed animal locations. We will build on previous labs' statistical development to create spatial layers of the covariates that appeared in the top RSF models you selected last week (either prey- or habitat-based). We will learn about the special challenge of evaluating logistic regression models of binary response data. We will learn about classification tables, sensitivity, specificity, and ROC curves to evaluate the ‘optimal’ cutpoint of prediction in logistic models. Next, we will use k-fold cross validation to conduct in-sample model validation for the RSF. Finally, we will also explore how different graphical display options for spatial predictions influence the interpretation of the ‘habitat map’.
0.1 Preliminaries: setting packages
0.1 Preliminaries: setting working directory
##### 0.1 Preliminaries: setting working directory #######################################################################

## define working directory on each computer
wd_laptop <- "/Users/mark.hebblewhite/Dropbox/WILD 562/Spring2017/lab6/"
wd_desktop <- "/Users/mark.hebblewhite/Documents/Teaching/UofMcourses/WILD562/Spring2017/Labs/lab6" 

## automatically set working directory depending which computer you're on.
## FIX: the original used ifelse(), which is the VECTORIZED conditional meant
## for data, not for a single scalar test with side effects; plain if/else is
## the idiomatic form here.
if (file.exists(wd_laptop)) {
  setwd(wd_laptop)
} else {
  setwd(wd_desktop)
}
getwd()
## [1] "/Users/mark.hebblewhite/Dropbox/WILD 562/Spring2017/lab6"
list.files() ## handy command to see what is inside the working directory
##  [1] "clumpydata.pdf"                   "homerangeALL.dbf"                
##  [3] "homerangeALL.prj"                 "homerangeALL.shp"                
##  [5] "homerangeALL.shx"                 "kxv.pdf"                         
##  [7] "kxv.R"                            "lab6.html"                       
##  [9] "lab6.R"                           "lab6.rmd"                        
## [11] "Lec_12_Evaluating RSFs.pptx"      "pronghorn.csv"                   
## [13] "rasters"                          "rasters.zip"                     
## [15] "readings"                         "sampleK-folds_spreadsheet.xls"   
## [17] "WILD562_Lab6_Evaluating RSF.docx" "wolfkde.csv"                     
## [19] "wolfmcp.csv"
0.2 Saving and loading data and shapefile data of Kernel home range from Lab
## Load the wolf used/available dataset from earlier labs and drop rows with
## any missing covariates (glm would drop them anyway; doing it explicitly
## keeps fitted/residual vectors aligned with the data frame rows).
wolfkde2 <- read.csv("wolfkde.csv", header=TRUE, sep = ",", na.strings="NA", dec=".")
wolfkde3 <-na.omit(wolfkde2)
wolfkde3$usedFactor <-as.factor(wolfkde3$usedFactor) ## make sure usedFactor is a factor
# head(wolfkde3)
length(wolfkde3$used) ## sample size after na.omit()
## [1] 2118
#source("/Users/mark.hebblewhite/Dropbox/WILD 562/Spring2017/lab4/new/Lab2NeededforLab5.R", verbose = FALSE)
# plot(homerangeALL)
#writeOGR(homerangeALL, dsn=wd_laptop, layer = "homerangeALL", driver = "ESRI Shapefile", overwrite_layer=TRUE)

## Read the kernel home-range shapefile (2 features -- presumably one polygon
## per wolf pack; confirm against the 'pack' levels used later).
## NOTE(review): dsn=wd_laptop is hard-coded, so this line fails on the
## desktop machine even though the working directory was chosen conditionally
## above -- consider dsn=getwd() instead.
kernelHR <- readOGR(dsn=wd_laptop, "homerangeALL")
## OGR data source with driver: ESRI Shapefile 
## Source: "/Users/mark.hebblewhite/Dropbox/WILD 562/Spring2017/lab6/", layer: "homerangeALL"
## with 2 features
## It has 2 fields
plot(kernelHR)

extent(kernelHR)
## class       : Extent 
## xmin        : 546836.6 
## xmax        : 612093.9 
## ymin        : 5662036 
## ymax        : 5748911
## Create an empty raster snapped (approximately) to the home-range extent;
## note the x limits below are truncated to whole metres relative to
## extent(kernelHR) printed above.
kernels <- raster()
extent(kernels) <- c(xmin=546836, xmax=612093, ymin=5662036, ymax=5748911) 
0.3 Loading raster’s needed for mapping RSF models later
## Load the 30 m raster covariate layers resampled in lab 4.
## NOTE(review): absolute path -- breaks on the desktop machine; a path
## relative to the working directory chosen above would be portable.
setwd("/Users/mark.hebblewhite/Dropbox/WILD 562/Spring2017/lab6/rasters")
#list.files()
## Ungulate/wolf habitat-suitability rasters
deer_w<-raster("deer_w2.tif")
moose_w<-raster("moose_w2.tif")
elk_w<-raster("elk_w2.tif") # NOTE(review): original comment said "already brought in above", but elk_w is not loaded earlier in this file
sheep_w<-raster("sheep_w2.tif")
goat_w<-raster("goat_w2.tif")
wolf_w<-raster("wolf_w2.tif")
elevation2<-raster("Elevation2.tif") #resampled
disthumanaccess2<-raster("DistFromHumanAccess2.tif") #resampled in lab 4
disthhu2<-raster("DistFromHighHumanAccess2.tif") #resampled in lab 4
landcover2 <- raster("landcover2.tif") ## resampled to same extent as lab 4
## but note we need to repopulate the fields with the habitat legend information
## pull cell values into the in-memory @data@values slot so they can be
## indexed directly when building the dummy layers below
landcover2@data@values <- getValues(landcover2)

## note that the extents are all different for human access and elc_habitat-derived layers, so need to recreate a new extent
#create an empty raster
mask.raster <- raster()

#set extent (note that I customized this extent so it covered both elc_habitat and humanacess)
extent(mask.raster) <- c(xmin=443680.6, xmax=650430.4, ymin=5618405, ymax=5789236)  

#set the resolution to 30 m 
res(mask.raster)<-30

#match projection to elc_habitat shapefile
projection(mask.raster)<- "+proj=utm +zone=11 +datum=NAD83 +units=m +no_defs +ellps=GRS80 +towgs84=0,0,0"

#set all values of mask.raster to zero
mask.raster[]<-0

0.3 Creating ‘dummy’ variable landcover rasters for use in mapping later
# Clone the empty mask raster once per landcover class so that every 'dummy'
# (indicator) layer starts with identical extent, resolution, and projection.
alpine <- burn <- closedConif <- herb <- mixed <-
  rockIce <- water <- modConif <- decid <- mask.raster

# Populate each indicator raster: cell = 1 where landcover2 matches that
# class's code(s), 0 for any other class, NA where landcover is NA.
# Codes used here: 2 = moderate conifer, 3 = closed conifer, 5 = mixed,
# 7 = herbaceous, 9 = water, 10 = rock/ice, 12-14 = burn, 15-16 = alpine.
lc <- landcover2@data@values  # hoist the repeated slot lookup out of every line

alpine@data@values      <- ifelse(lc == 15 | lc == 16, 1, ifelse(is.na(lc), NA, 0))
burn@data@values        <- ifelse(lc == 12 | lc == 13 | lc == 14, 1, ifelse(is.na(lc), NA, 0))
closedConif@data@values <- ifelse(lc == 3, 1, ifelse(is.na(lc), NA, 0))
herb@data@values        <- ifelse(lc == 7, 1, ifelse(is.na(lc), NA, 0))
mixed@data@values       <- ifelse(lc == 5, 1, ifelse(is.na(lc), NA, 0))
rockIce@data@values     <- ifelse(lc == 10, 1, ifelse(is.na(lc), NA, 0))
water@data@values       <- ifelse(lc == 9, 1, ifelse(is.na(lc), NA, 0))
modConif@data@values    <- ifelse(lc == 2, 1, ifelse(is.na(lc), NA, 0))
## BUG FIX: the original tested 'lc == 10' here, duplicating the rock/ice
## class, so the decid layer was identical to rockIce. Deciduous appears to be
## code 4 (the unused codes are 1 = open conifer [the intercept], 4, 6, 8, 11)
## -- TODO confirm code 4 against the landcover legend.
decid@data@values       <- ifelse(lc == 4, 1, ifelse(is.na(lc), NA, 0))

plot(rockIce)
plot(kernelHR, add=TRUE)

plot(closedConif)
plot(kernelHR, add=TRUE)

# note that open conifer (code 1) is the reference class absorbed by the
# model intercept, so it gets no dummy raster

0.3 Creating a raster stack ## not really needed this lab.
#stack raster layers (i.e., create raster stack for sampling; must have same extent and resolution)
## (per the section header, the stack is not actually used again in this lab)
all_rasters<-stack(deer_w, moose_w, elk_w, sheep_w, goat_w, wolf_w,elevation2, disthumanaccess2, disthhu2, landcover2, alpine, burn, closedConif, modConif, herb, mixed, rockIce, water, decid)

Objective 1.0 Multiple Logistic Regression & Collinearity
We will first evaluate collinearity between Distance to High Human Use and Elevation
##### Running the 'top' models from last week

## top Biotic model was model 41
## Logistic regression of used (1) vs available (0) points on distance to
## human access plus deer and goat habitat-suitability values.
top.biotic <- glm(used ~ DistFromHumanAccess2+deer_w2 + goat_w2, family=binomial(logit), data=wolfkde3)
summary(top.biotic)
## 
## Call:
## glm(formula = used ~ DistFromHumanAccess2 + deer_w2 + goat_w2, 
##     family = binomial(logit), data = wolfkde3)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.8345  -0.6246  -0.1888  -0.0306   3.1247  
## 
## Coefficients:
##                       Estimate Std. Error z value Pr(>|z|)    
## (Intercept)          -3.553037   0.366553  -9.693  < 2e-16 ***
## DistFromHumanAccess2 -0.001422   0.000183  -7.770 7.86e-15 ***
## deer_w2               0.898069   0.077941  11.522  < 2e-16 ***
## goat_w2              -0.333540   0.048524  -6.874 6.26e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2040.9  on 2117  degrees of freedom
## Residual deviance: 1398.2  on 2114  degrees of freedom
## AIC: 1406.2
## 
## Number of Fisher Scoring iterations: 7
#double check the VIF for each final model, just to be sure
vif(top.biotic)
## DistFromHumanAccess2              deer_w2              goat_w2 
##             1.039378             1.020137             1.022622
## all VIFs are ~1, so no collinearity concern among these three covariates
# Environmental Model - top model is model 11
## Logistic regression on elevation, distance to high human access, and the
## landcover dummy variables (open conifer is the reference class).
top.env <- glm(used ~ Elevation2 + DistFromHighHumanAccess2 + openConif+modConif+closedConif+mixed+herb+shrub+water+burn, family=binomial(logit), data=wolfkde3)
summary(top.env)
## 
## Call:
## glm(formula = used ~ Elevation2 + DistFromHighHumanAccess2 + 
##     openConif + modConif + closedConif + mixed + herb + shrub + 
##     water + burn, family = binomial(logit), data = wolfkde3)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.0290  -0.5020  -0.1576  -0.0366   3.2732  
## 
## Coefficients:
##                            Estimate Std. Error z value Pr(>|z|)    
## (Intercept)               9.570e+00  8.805e-01  10.869  < 2e-16 ***
## Elevation2               -6.782e-03  4.883e-04 -13.888  < 2e-16 ***
## DistFromHighHumanAccess2  1.867e-04  3.511e-05   5.317 1.05e-07 ***
## openConif                 8.457e-01  4.404e-01   1.920   0.0548 .  
## modConif                 -1.716e-02  3.836e-01  -0.045   0.9643    
## closedConif              -1.126e-01  3.944e-01  -0.286   0.7752    
## mixed                     1.325e+00  5.435e-01   2.438   0.0148 *  
## herb                      8.564e-01  5.525e-01   1.550   0.1212    
## shrub                     5.781e-01  4.486e-01   1.289   0.1974    
## water                     8.559e-01  6.389e-01   1.340   0.1804    
## burn                      1.861e+00  4.629e-01   4.021 5.80e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2040.9  on 2117  degrees of freedom
## Residual deviance: 1298.3  on 2107  degrees of freedom
## AIC: 1320.3
## 
## Number of Fisher Scoring iterations: 7
vif(top.env)
##               Elevation2 DistFromHighHumanAccess2                openConif 
##                 2.186429                 2.026714                 2.892384 
##                 modConif              closedConif                    mixed 
##                 7.720150                 5.317916                 1.857145 
##                     herb                    shrub                    water 
##                 1.778124                 2.733747                 1.515043 
##                     burn 
##                 2.322895
## NOTE(review): modConif (7.7) and closedConif (5.3) exceed the conventional
## VIF ~5 cutoff; with mutually exclusive landcover dummies some inflation is
## expected, but worth confirming before interpreting those coefficients.
# Calculate the AICc model selection table comparing the two top models.
library(AICcmodavg)  # library() fails loudly if the package is missing; require() only warns
## BUG FIX: modnames must follow the SAME order as the candidate list. The
## original paired list(top.biotic, top.env) with c("envfinal", "bioticmodel"),
## so the printed table mislabeled the models (the 11-parameter environmental
## model appeared as "bioticmodel"). The labels below match the list order.
models <- list(top.biotic, top.env)
modnames <- c("top.biotic", "top.env")
aictab(cand.set = models, modnames = modnames)
## 
## Model selection based on AICc:
## 
##              K    AICc Delta_AICc AICcWt Cum.Wt      LL
## top.env     11 1320.41       0.00      1      1 -649.14
## top.biotic   4 1406.25      85.85      0      1 -699.12
1.1 Evaluating predictions from residual plots for top.env model
# First review here in Lab http://www.statmethods.net/stats/rdiagnostics.html for Linear Regression (Normal regression)

plot(top.env)

## this cycles through residual plots which look decidedly different from 'normal' residual plots

##### Saving predictions manually, an example with the environment model
wolfkde3$fitted.top.env <- fitted(top.env)
#### this is the predicted probability from the model

wolfkde3$residuals.top.env <- residuals(top.env)
## these are the deviations from the predictions for each row (data point)

wolfkde3$rstudent.top.env <- rstudent(top.env)
## This is a standardized residual - the studentized residual

wolfkde3$hatvalues.top.env <- hatvalues(top.env)
#### this is the first of the leverage statistics, the larger hat value is, the bigger the influence on the fitted value

wolfkde3$cooks.distance.top.env <- cooks.distance(top.env)
#### this is the Cook's distance leverage statistic; the larger it is, the bigger the influence on the fit

wolfkde3$obsNumber <- 1:nrow(wolfkde3) ## just added a row number for plotting

## Making manual residual versus predicted plots
plot(wolfkde3$fitted.top.env, wolfkde3$residuals.top.env)

## NOTE(review): the ggplot calls below use wolfkde3$col inside aes(); bare
## column names (aes(residuals.top.env, ...)) are the safer idiom.
ggplot(wolfkde3, aes(fitted.top.env, residuals.top.env)) + geom_point() + geom_text(aes(label = obsNumber, colour = used))

## Manual residual plot

ggplot(wolfkde3, aes(wolfkde3$residuals.top.env, wolfkde3$cooks.distance.top.env)) + geom_point() + geom_text(aes(label = obsNumber, colour = used))

## shows us some points at high cooks values that might be having a big influence

ggplot(wolfkde3, aes(wolfkde3$cooks.distance.top.env, wolfkde3$hatvalues.top.env)) + geom_point() + geom_text(aes(label = obsNumber, colour = used))

## shows us some points at high cooks values that might be having a big influence
## this helps identify some locations that have high leverage that are used (1 - blue) and available (0=black) points

## e.g., datapoint 16
wolfkde3[16,]
##    deer_w2 moose_w2 elk_w2 sheep_w2 goat_w2 wolf_w2 Elevation2
## 19       4        4      4        4       3       4   2086.334
##    DistFromHumanAccess2 DistFromHighHumanAccess2 landcover16 EASTING
## 19             1289.253                 1595.905           1  570610
##    NORTHING     pack used usedFactor  habitatType    landcov.f closedConif
## 19  5710430 Red Deer    1          1 Open Conifer Open Conifer           0
##    modConif openConif decid regen mixed herb shrub water rockIce burn
## 19        0         1     0     0     0    0     0     0       0    0
##    alpineHerb alpineShrub alpine fitted.top.env residuals.top.env
## 19          0           0      0     0.03119327          2.633459
##    rstudent.top.env hatvalues.top.env cooks.distance.top.env obsNumber
## 19         2.654942       0.003644757              0.0103663        16
## is a red deer wolf used point at high elevation in open conifer far from human access. 
## Does not seem to be a data entry error - a REAL data point


#Evaluating model fit graphically
# first lets make a plot of Y against X predictions...
plot(wolfkde3$Elevation2, wolfkde3$fitted.top.env)

##### Another way of looking at it
## NOTE(review): scatterplot() with reg.line/boxplots arguments matches the
## car package interface (vif above is also from car) -- confirm.
scatterplot(fitted.top.env~Elevation2, reg.line=lm, smooth=TRUE, spread=TRUE, boxplots='xy', 
            span=0.5, xlab="elevation", ylab="residual", cex=1.5, cex.axis=1.4, cex.lab=1.4, 
            data=wolfkde3)

## NOTE(review): base hist() has no 'scale' argument; this signature matches
## the R Commander Hist() wrapper -- confirm which function is intended.
hist(wolfkde3$fitted.top.env, scale="frequency", breaks="Sturges", col="darkgray")

#### This plot is VERY important - it is the predicted probability of a location being a wolf used location given your top model. But how would we divide this into wolf habitat and wolf available?

ggplot(wolfkde3, aes(x=wolfkde3$fitted.top.env, fill=usedFactor)) + geom_histogram(binwidth=0.05, position="identity", alpha=0.7) + xlab("Predicted Probability of Wolf Use") + theme(axis.title.x=element_text(size=16)) #+ facet_grid(pack ~ ., scales="free")

This plot shows that, eyeballing it, somewhere around 0.25–0.40 looks like where we could ‘cut’ used and available points.
## density-scaled histogram of predicted probabilities, faceted by wolf pack
## NOTE(review): '..density..' is the legacy ggplot2 spelling; newer versions
## prefer after_stat(density) -- confirm against the ggplot2 version in use.
ggplot(wolfkde3, aes(x=fitted.top.env, y=..density.., fill=usedFactor)) + geom_histogram(binwidth=0.05, position="identity", alpha=0.7) + xlab("Predicted Probability of Wolf Use") + theme(axis.title.x=element_text(size=16)) + facet_grid(pack ~ ., scales="free")

###### But note this ‘cut’ point looks different for both wolf packs? This introduces the basic problem of dividing continuous predictions from logistic regression into categories of habitat (1) and available (0)

1.2 Evaluating predictions from residual plots for top.biotic model
plot(top.biotic)

##### Saving predictions manually, this time for the top biotic model
wolfkde3$fitted.top.biotic <- fitted(top.biotic)
#### this is the predicted probability from the model

wolfkde3$residuals.top.biotic <- residuals(top.biotic)
## these are the deviations from the predictions for each row (data point)

wolfkde3$rstudent.top.biotic <- rstudent(top.biotic)
## This is a standardized residual - the studentized residual

wolfkde3$hatvalues.top.biotic <- hatvalues(top.biotic)
#### this is the first of the leverage statistics, the larger hat value is, the bigger the influence on the fitted value

wolfkde3$cooks.distance.top.biotic <- cooks.distance(top.biotic)
#### This is the Cook's distance leverage statistic


## Making manual residual versus predicted plots
plot(wolfkde3$fitted.top.biotic, wolfkde3$residuals.top.biotic)

ggplot(wolfkde3, aes(wolfkde3$cooks.distance.top.biotic, wolfkde3$hatvalues.top.biotic)) + geom_point() + geom_text(aes(label = obsNumber, colour = used))

## this helps identify some locations that have high leverage that are used (1 - blue) and available (0=black) points

## e.g., datapoint 13 (row 13 of wolfkde3; the original comment said 16)
wolfkde3[13,]
##    deer_w2 moose_w2 elk_w2 sheep_w2 goat_w2 wolf_w2 Elevation2
## 14       4        4      4        3       3       4   2007.453
##    DistFromHumanAccess2 DistFromHighHumanAccess2 landcover16 EASTING
## 14               2105.2                 9342.662           2  575450
##    NORTHING     pack used usedFactor      habitatType        landcov.f
## 14  5717684 Red Deer    1          1 Moderate Conifer Moderate Conifer
##    closedConif modConif openConif decid regen mixed herb shrub water
## 14           0        1         0     0     0     0    0     0     0
##    rockIce burn alpineHerb alpineShrub alpine fitted.top.env
## 14       0    0          0           0      0     0.08969027
##    residuals.top.env rstudent.top.env hatvalues.top.env
## 14          2.196084         2.206977       0.004703072
##    cooks.distance.top.env obsNumber fitted.top.biotic residuals.top.biotic
## 14            0.004380538        13        0.01881658             2.818871
##    rstudent.top.biotic hatvalues.top.biotic cooks.distance.top.biotic
## 14            2.836193           0.00187503                0.02453511
## is a red deer wolf used point at high elevation in open conifer far from human access that is being classified as an AVAILABLE location. 
wolfkde3[30,]
##    deer_w2 moose_w2 elk_w2 sheep_w2 goat_w2 wolf_w2 Elevation2
## 35       3        4      4        4       4       4   2169.713
##    DistFromHumanAccess2 DistFromHighHumanAccess2 landcover16 EASTING
## 35             1704.151                 9264.565           2  580310
##    NORTHING     pack used usedFactor      habitatType        landcov.f
## 35  5708450 Red Deer    1          1 Moderate Conifer Moderate Conifer
##    closedConif modConif openConif decid regen mixed herb shrub water
## 35           0        1         0     0     0     0    0     0     0
##    rockIce burn alpineHerb alpineShrub alpine fitted.top.env
## 35       0    0          0           0      0     0.03129583
##    residuals.top.env rstudent.top.env hatvalues.top.env
## 35          2.632212         2.644415       0.002075902
##    cooks.distance.top.env obsNumber fitted.top.biotic residuals.top.biotic
## 35            0.005865753        30       0.009800049             3.041502
##    rstudent.top.biotic hatvalues.top.biotic cooks.distance.top.biotic
## 35            3.053085         0.0006981523                0.01766003
## another high wolf used point that is being classified as an AVAILABLE location

#Evaluating model fit graphically
# first lets make a plot of Y against X predictions...
plot(wolfkde3$Elevation2, wolfkde3$fitted.top.biotic)

##### Another way of looking at it
scatterplot(fitted.top.biotic~Elevation2, reg.line=lm, smooth=TRUE, spread=TRUE, boxplots='xy', 
            span=0.5, xlab="elevation", ylab="residual", cex=1.5, cex.axis=1.4, cex.lab=1.4, 
            data=wolfkde3)

hist(wolfkde3$fitted.top.biotic, scale="frequency", breaks="Sturges", col="darkgray")


## NOTE(review): this hist() call is an exact duplicate of the previous one
hist(wolfkde3$fitted.top.biotic, scale="frequency", breaks="Sturges", col="darkgray")

##### This plot is VERY important - it is the predicted probability of a location being a wolf used location given your top model
##### But how would we divide this into wolf habitat and wolf available? 

ggplot(wolfkde3, aes(x=wolfkde3$fitted.top.biotic, fill=usedFactor)) + geom_histogram(binwidth=0.05, position="identity", alpha=0.7) + xlab("Predicted Probability of Wolf Use") + theme(axis.title.x=element_text(size=16)) #+ facet_grid(pack ~ ., scales="free")

#### This plot shows that somewhere around 0.25 - 0.40, eyeballing it, it looks like we could 'cut' used and available points? 

ggplot(wolfkde3, aes(x=fitted.top.biotic, y=..density.., fill=usedFactor)) + geom_histogram(binwidth=0.05, position="identity", alpha=0.7) + xlab("Predicted Probability of Wolf Use") + theme(axis.title.x=element_text(size=16)) + facet_grid(pack ~ ., scales="free")

#### But note this 'cut' point looks different for both wolf packs?
1.3. Comparing the ‘fit’ of the biotic and environmental models
## Plot the two models' predicted probabilities against each other.
## NOTE(review): 'fill = pack' has no visible effect on the default point
## shape; 'colour = pack' is probably what was intended -- confirm.
ggplot(wolfkde3, aes(x=fitted.top.biotic, y=fitted.top.env, fill = pack)) + geom_point() + stat_smooth(method="lm")

## there is quite a bit of scatter here in the predictions between the two models, but in general, they are highly correlated

ggplot(wolfkde3, aes(x=fitted.top.biotic, y=fitted.top.env, fill = pack)) + geom_point() + stat_smooth()

## but some evidence that the top environmental model is not successfully predicting the 'best' wolf habitat especially in the bow valley wolf pack compared to the top biotic model
2.0 Classification Tables from top models
2.1 Classification table for top biotic model
# First we will arbitrarily define the cutpoint between 1 and 0's using p = 0.5
ppused = wolfkde3$fitted.top.biotic>0.5
table(ppused,wolfkde3$used)
##        
## ppused     0    1
##   FALSE 1655  229
##   TRUE    67  167
### now go through Hosmer and Lemeshow chapter 5 to calculate our classification success for the 1's (the 167 + 229 column above)
167/(167+229)
## [1] 0.4217172
So when wolf telemetry locations were known = 1, the model classified 42% as ‘used’. This is pretty terrible? This is also called the Specificity of a Model.
Now lets do the correct classification rate of the true 0’s (i.e., avail)
## fraction of true 0's (available points) classified correctly at p = 0.5
1655/(1655+67)
## [1] 0.9610918
But when the points were really 0’s, available, we classified them 96% of the time correctly. This is called the Sensitivity of a model.
Now let's do this manually using cutpoints of 0.25 and 0.1
## Now let's do this manually using cutpoints of 0.25 and 0.1
## (terminology note: this lab follows the caret output below, where the
## 'positive' class is 0 -- so "sensitivity" = P(correct | true 0) and
## "specificity" = P(correct | true 1))
ppused = wolfkde3$fitted.top.biotic>0.25
table(ppused,wolfkde3$used)
##        
## ppused     0    1
##   FALSE 1376   92
##   TRUE   346  304
#### Now, what is specificity? (i.e., the probability of classifying the 1's correctly?)
304/(304+92)
## [1] 0.7676768
#### about 76% - Great! But - what happened to our sensitivity (i.e., the probability of classifying the 0's correctly?)
1376 / (1376+346)
## [1] 0.7990708
#### so our probability of classifying 0's correctly decreases to ~ 80% with our sensitivity

### now lets try a p = 0.10
ppused = wolfkde3$fitted.top.biotic>0.10
table(ppused,wolfkde3$used)
##        
## ppused     0    1
##   FALSE 1001   39
##   TRUE   721  357
#### Now, what is specificity? (i.e., the probability of classifying the 1's correctly?)
357/(357+39)
## [1] 0.9015152
#### about 90% - Great! But - what happened to our sensitivity (i.e., the probability of classifying the 0's correctly?)
1001 / (1001+721)
## [1] 0.5813008
#### so our probability of classifying 0's correctly decreases with our sensitivity
Now lets calculate the Confusion Matrix from the package caret
require(caret)  ## NOTE(review): library(caret) is preferred; require() only warns on failure

## binary prediction at the arbitrary 0.5 cutpoint, then cross-tabulate
wolfkde3$pr.top.biotic.used <- ifelse(wolfkde3$fitted.top.biotic>0.5, 1, 0)
xtab1<-table(wolfkde3$pr.top.biotic.used, wolfkde3$used)
xtab1
##    
##        0    1
##   0 1655  229
##   1   67  167
#?confusionMatrix
## note the output's 'Positive' Class : 0 -- caret's Sensitivity here is the
## correct classification rate of the 0's (available points)
confusionMatrix(xtab1)
## Confusion Matrix and Statistics
## 
##    
##        0    1
##   0 1655  229
##   1   67  167
##                                           
##                Accuracy : 0.8602          
##                  95% CI : (0.8447, 0.8747)
##     No Information Rate : 0.813           
##     P-Value [Acc > NIR] : 4.668e-09       
##                                           
##                   Kappa : 0.4544          
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.9611          
##             Specificity : 0.4217          
##          Pos Pred Value : 0.8785          
##          Neg Pred Value : 0.7137          
##              Prevalence : 0.8130          
##          Detection Rate : 0.7814          
##    Detection Prevalence : 0.8895          
##       Balanced Accuracy : 0.6914          
##                                           
##        'Positive' Class : 0               
## 
### This reveals a LOT of information - lets compare these values to the table in the ? confusionMatrix help file, and in the lab manual. 
Exercise: Redo for the top.env model

3.0 Receiver Operating Characteristic ROC curves

ROCR package help here: https://rocr.bioinf.mpi-sb.mpg.de

3.1 Evaluating the top biotic model

require(ROCR)  ## NOTE(review): library(ROCR) is preferred; require() only warns on failure
## predicted probabilities from the top biotic model (same as fitted())
pp = predict(top.biotic,type="response")

## ROCR prediction object pairs predictions with the observed 0/1 labels
pred = prediction(pp, wolfkde3$used)
perfClass <- performance(pred, "tpr","fpr") # change 2nd and/or 3rd arguments for other metrics
plot(perfClass)

#Calculate the Area Under Curve
BMauc <- performance(pred, measure="auc") 
str(BMauc)
## Formal class 'performance' [package "ROCR"] with 6 slots
##   ..@ x.name      : chr "None"
##   ..@ y.name      : chr "Area under the ROC curve"
##   ..@ alpha.name  : chr "none"
##   ..@ x.values    : list()
##   ..@ y.values    :List of 1
##   .. ..$ : num 0.868
##   ..@ alpha.values: list()
## extract the AUC scalar from the S4 y.values slot
auc <- as.numeric(BMauc@y.values)
auc
## [1] 0.8678605
# Maximize the sum of sensitivity and specificity (Youden's J statistic) over
# all candidate cutoffs stored in the ROCR performance object.
fpr <- perfClass@x.values[[1]]
tpr <- perfClass@y.values[[1]]
youden <- tpr + (1 - fpr)  # FIX: renamed from 'sum', which shadowed base::sum()
index <- which.max(youden)
cutoff <- perfClass@alpha.values[[1]][[index]]
cutoff
## [1] 0.2363158
## thus the cutpoint that maximizes the overall classification success is 0.236

# Plot ROC Curve with cut point and AUC
plot(perfClass, colorize = TRUE, lwd = 5, print.cutoffs.at = seq(0, 1, by = 0.1),
     text.adj = c(1.2, 1.2),
     main = "ROC Curve")
## FIX: label with the AUC computed above rather than the hard-coded "0.867"
text(0.5, 0.5, paste0("AUC = ", round(auc, 3)))
abline(v = cutoff, col = "red", lwd = 3)

Now lets go back and use this cutoff to calculate the Confusion Matrix ourself
### now lets try a p = of our (Youden-optimal) cutoff
ppused = wolfkde3$fitted.top.biotic>cutoff
table(ppused,wolfkde3$used)
##        
## ppused     0    1
##   FALSE 1344   76
##   TRUE   378  320
#### Now, what is specificity? (i.e., the probability of classifying the 1's correctly?)
320/(320+76)
## [1] 0.8080808
#### about 80% - Great! But - what happened to our sensitivity (i.e., the probability of classifying the 0's correctly?)
1344 / (1344+378)
## [1] 0.7804878
#### so our probability of classifying 0's correctly is about 78%

## same classification, now as a caret confusion matrix
wolfkde3$pr.top.biotic.used2 <- ifelse(wolfkde3$fitted.top.biotic>cutoff, 1, 0)
xtab2<-table(wolfkde3$pr.top.biotic.used2, wolfkde3$used)
xtab2
##    
##        0    1
##   0 1344   76
##   1  378  320
#?confusionMatrix
confusionMatrix(xtab2)
## Confusion Matrix and Statistics
## 
##    
##        0    1
##   0 1344   76
##   1  378  320
##                                          
##                Accuracy : 0.7856         
##                  95% CI : (0.7675, 0.803)
##     No Information Rate : 0.813          
##     P-Value [Acc > NIR] : 0.9993         
##                                          
##                   Kappa : 0.455          
##  Mcnemar's Test P-Value : <2e-16         
##                                          
##             Sensitivity : 0.7805         
##             Specificity : 0.8081         
##          Pos Pred Value : 0.9465         
##          Neg Pred Value : 0.4585         
##              Prevalence : 0.8130         
##          Detection Rate : 0.6346         
##    Detection Prevalence : 0.6704         
##       Balanced Accuracy : 0.7943         
##                                          
##        'Positive' Class : 0              
## 
Now lets use this cutoff to classify used and avail locations into 1 and 0’s, and make a plot of where this cutoff is using geom_vline() in ggplot
## this is our best model classifying used and avail locations into 1 and 0's. 
## the red vertical line marks the Youden-optimal cutoff computed above
ggplot(wolfkde3, aes(x=wolfkde3$fitted.top.biotic, fill=usedFactor)) + geom_histogram(binwidth=0.05, position="identity", alpha=0.7) + xlab("Predicted Probability of Wolf Use") + theme(axis.title.x=element_text(size=16)) + geom_vline(xintercept = cutoff, col="red")

##### This graph shows the optimal cutpoint based on our data

Finally, this next step calculates the default expected prevalence of 1’s and 0’s in our sample; compare that to the optimal cutpoint.
table(wolfkde3$used)
## 
##    0    1 
## 1722  396
## prevalence of used points = U/(U+A)
396/(1722+396)
## [1] 0.1869688
## note that its VERY similar to our sampling fraction - the fact that it is bigger than the basic sampling fraction is interesting, but its essentially determined by the U/(U+A)

3.2 Evaluating the top Environmental Model - on your own.

4.0 k-folds cross validation

We are going to load the custom function kxv.R from source
See the kxv.R details for information about this function. It was basically programmed for Boyce for the 2002 paper
4.1 evaluating the top Environmental Model
## Load the custom k-folds cross-validation functions (kxvglm etc.).
## NOTE(review): absolute path -- a relative "kxv.R" would be portable
## given the working directory set in section 0.1.
source("/Users/mark.hebblewhite/Dropbox/WILD 562/Spring2017/lab6/kxv.R", verbose = FALSE)
## Module kxv.R loaded.
## 
## Type kxvtest() to run the following example of a fixed-effects
## model with sample Elk data:
## 
## elk <- read.csv(file="HR_Summer.csv", header=T)
## kxvglm( PTTYPE~DEM30M+DEM30M^2+VARCES1+FORGSUM+FORGSUM^2, elk, k=5,nbin=10)
## 
## The test will do kxvPrintFlag <- TRUE in order to
## print a summary of each fitted model, but this
## flag is FALSE by default.
## 
## To turn off plotting, do kxvPlotFlag <- FALSE.
Now fit a 5-fold k=5 cross validation of the top.env model
# Kfolds with a 'fuzz' factor
## these assignments set GLOBAL flags that kxv.R presumably reads (see the
## kxv.R banner above) -- they are not arguments to kxvglm()
kxvPrintFlag=FALSE
kxvPlotFlag=TRUE
kxvFuzzFactor = 0.01
## 5-fold cross-validation of the top environmental model, binned into 10
## ranked habitat categories
kfolds = kxvglm(top.env$formula, data=wolfkde3, k=5, nbin=10)

kfolds
##                r.rho            p
## 1          0.9478045 3.048217e-05
## 2          0.7754764 8.393730e-03
## 3          0.9535451 1.926125e-05
## 4          0.9374369 6.212522e-05
## 5          0.9629500 7.882961e-06
## (meanfreq) 0.9665698 5.248116e-06
##### These values tell you the spearman rank correlation between every subset of the data 1 - 5 and the predicted correlation between the # of observations in ranked categories of habitat from 1, 10. 

##### But note that this K-folds cross validation does not address the structure of the data within wolf packs
##### Next step is to subset by wolf pack and see how well the overall wolf model predicts wolf use by both wolf packs
# Kfolds by each pack with a 'fuzz' factor
kxvPrintFlag=FALSE
kxvPlotFlag=TRUE
kxvFuzzFactor = 0.01
kfolds2 = kxvglm(top.env$formula, data=wolfkde3, k=5, nbin=10, partition="pack")

kfolds2
##                 r.rho           p
## Bow Valley  0.7963563 0.005836997
## Red Deer   -0.2324257 0.518156318
## (meanfreq)  0.7575758 0.015920829
So the answer is that the overall pooled model predicts the Bow Valley pack really well, but fails to predict the Red Deer pack any better, essentially, than random

4.2 evaluating the top Biotic Model - on your own

4.3 Manual k-folds cross-validation

But lets evaluate the steps of at least 1 loop (k=1 of 5) MANUALLY so we can see exactly what k-folds is doing
# Create a vector of random "folds", in this case 5 (values 1:5)
# NOTE(review): consider calling set.seed() first so fold assignment is reproducible
wolfkde3$rand.vec <- sample(1:5, nrow(wolfkde3), replace = TRUE)

# Fit the model on the single random subset of the data == 1
top.env.1 <- glm(used ~ Elevation2 + DistFromHighHumanAccess2 + openConif+modConif+closedConif+mixed+herb+shrub+water+burn, family=binomial(logit), data=wolfkde3, subset=rand.vec==1)

# Make predictions for the points NOT used to fit the model (folds 2:5)
pred.prob <- predict(top.env.1, newdata = wolfkde3[wolfkde3$rand.vec != 1, ], type = "response")

# Deciles of the predictions: the 'bins' of the categories of habitat availability
q.pp <- quantile(pred.prob, probs = seq(0, 1, 0.1))

# Assign each withheld prediction to one of the 10 bins.
# cut() with right = FALSE reproduces the original left-closed [q.pp[i], q.pp[i+1])
# intervals, and include.lowest = TRUE also places the maximum prediction in bin 10.
# BUG FIX: the original loop tested pred.prob < q.pp[i+1], so the largest
# prediction never satisfied any interval and was left NA (dropped from the table).
bin <- cut(pred.prob, breaks = q.pp, right = FALSE, include.lowest = TRUE, labels = FALSE)

## Count just the used locations for all other k folds 2:5
used1 <- wolfkde3$used[wolfkde3$rand.vec != 1]

## Tabulate used (0/1) against habitat-rank bin
rand.vec.1.table <- table(used1, bin)
rand.vec.1.table
##      bin
## used1   1   2   3   4   5   6   7   8   9  10
##     0 172 164 171 163 166 159 134 137  80  53
##     1   0   8   2   8   6  14  38  34  92 119
## this basically shows the data in each bin for used and available locations. If the model is any 'good', then high-ranking habitat bins (7, 8, 9, 10) should have many more used locations in them. 
cor.test(c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10), c(0,2,0,8,6,15,24,50,99,110), method="spearman") 
## 
##  Spearman's rank correlation rho
## 
## data:  c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10) and c(0, 2, 0, 8, 6, 15, 24, 50, 99, 110)
## S = 5.516, p-value = 5.248e-06
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##       rho 
## 0.9665698
## which suggests that in this random fold of the data, the model predicted habitat use well

5.0 Mapping spatial predictions of ‘top’ model

5.1 Easy spatial prediction just in ggplot
par(mfrow = c(1, 1))

###### 5.1 Easy spatial predictions
# Plot each point at its coordinates, coloured by its fitted RSF value
map.biotic <- ggplot(wolfkde3, aes(EASTING, NORTHING, col = fitted.top.biotic)) +
  geom_point(size = 5) +
  coord_equal() +
  scale_colour_gradient(low = 'yellow', high = 'red')
map.biotic

map.env <- ggplot(wolfkde3, aes(EASTING, NORTHING, col = fitted.top.env)) +
  geom_point(size = 5) +
  coord_equal() +
  scale_colour_gradient(low = 'yellow', high = 'red')
map.env

##### What did we just do? What is this a map of?

5.2 Raster predictions for top Biotic Model

Note that the final step takes a long time.
par(mfrow = c(1,1))
summary(top.biotic)
## 
## Call:
## glm(formula = used ~ DistFromHumanAccess2 + deer_w2 + goat_w2, 
##     family = binomial(logit), data = wolfkde3)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.8345  -0.6246  -0.1888  -0.0306   3.1247  
## 
## Coefficients:
##                       Estimate Std. Error z value Pr(>|z|)    
## (Intercept)          -3.553037   0.366553  -9.693  < 2e-16 ***
## DistFromHumanAccess2 -0.001422   0.000183  -7.770 7.86e-15 ***
## deer_w2               0.898069   0.077941  11.522  < 2e-16 ***
## goat_w2              -0.333540   0.048524  -6.874 6.26e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2040.9  on 2117  degrees of freedom
## Residual deviance: 1398.2  on 2114  degrees of freedom
## AIC: 1406.2
## 
## Number of Fisher Scoring iterations: 7
#> top.biotic$coefficients
#(Intercept) DistFromHumanAccess2              deer_w2              goat_w2 
#-3.553037526         -0.001421547          0.898069385         -0.333539833 

# Extract the 4 fitted coefficients (intercept, distance, deer, goat) using
# the coef() accessor; the original `$coefficients[c(1:4)]` had a redundant c()
biotic.coefs <- coef(top.biotic)[1:4]
names(all_rasters)
##  [1] "deer_w2"                  "moose_w2"                
##  [3] "elk_w2"                   "sheep_w2"                
##  [5] "goat_w2"                  "wolf_w2"                 
##  [7] "Elevation2"               "DistFromHumanAccess2"    
##  [9] "DistFromHighHumanAccess2" "landcover2"              
## [11] "layer.1"                  "layer.2"                 
## [13] "layer.3"                  "layer.4"                 
## [15] "layer.5"                  "layer.6"                 
## [17] "layer.7"                  "layer.8"                 
## [19] "layer.9"
##### Note that this step takes a long time.
# Build the linear predictor once on the raster layers, then apply the inverse
# logit exp(x)/(1+exp(x)) to get predicted probability of use per cell.
# (The original duplicated the whole linear-predictor expression inside both
# the numerator and denominator, which is error-prone to edit.)
lin.pred.biotic <- biotic.coefs[1] + biotic.coefs[2]*disthumanaccess2 + biotic.coefs[3]*deer_w + biotic.coefs[4]*goat_w
rast.top.biotic <- exp(lin.pred.biotic) / (1 + exp(lin.pred.biotic))
## need to use the names of the raster layers we brought in up above. Note that they are not the same names as stored in the Raster stack
## need to use the names of the raster layers we brought in up above. Note that they are not the same names as stored in the Raster stack

# Plot the predicted-probability raster with a yellow-to-red habitat palette
pred.palette <- colorRampPalette(c("yellow", "orange", "red"))(255)
plot(rast.top.biotic, col = pred.palette)

# Same map, cropped to the kernel home-range extent, with the HR overlaid
plot(rast.top.biotic, col = pred.palette, ext = kernels)
plot(kernelHR, add = TRUE)

# Look at a histogram of predicted values
#hist(rast.top.biotic@data@values) ## note: this isn't working for some reason
To do 5.4 - mapping JUST the numerator of the RSF from a used-available design, and then comparing it with some random points
# RSF numerator only: w(x) = exp(b0 + b1*x1 + ...) — a RELATIVE selection surface,
# not bounded on [0, 1] like the logistic prediction above
rast.top.biotic.RSF <- exp(biotic.coefs[1] + biotic.coefs[2]*disthumanaccess2 + biotic.coefs[3]*deer_w + biotic.coefs[4]*goat_w) 
# BUG FIX: the raster plot method's cropping argument is `ext` (as used above);
# `extent=` is not a plot argument, so the map was not being cropped to the kernels
plot(rast.top.biotic.RSF, col=colorRampPalette(c("yellow", "orange", "red"))(255), ext=kernels)